For this assignment, the following libraries are used:
library(GGally)
library(ggplot2)
library(dplyr)
library(gridExtra)
library(ggcorrplot)
library(dendextend)
library(dbscan)
library(factoextra)
library(mclust)
library(cluster)
library(igraph)
library(clValid)
library(cluster)
dataset <- read.csv("../datasets/penguindata.csv")
summary(dataset)
## X bill_length_mm bill_depth_mm flipper_length_mm
## Min. : 1.00 Min. :32.10 Min. :13.10 Min. :172.0
## 1st Qu.: 86.75 1st Qu.:39.23 1st Qu.:15.60 1st Qu.:190.0
## Median :172.50 Median :44.45 Median :17.30 Median :197.0
## Mean :172.50 Mean :43.92 Mean :17.15 Mean :200.9
## 3rd Qu.:258.25 3rd Qu.:48.50 3rd Qu.:18.70 3rd Qu.:213.0
## Max. :344.00 Max. :59.60 Max. :21.50 Max. :231.0
## NA's :2 NA's :2 NA's :2
## body_mass_g sex year
## Min. :2700 Length:344 Min. :2007
## 1st Qu.:3550 Class :character 1st Qu.:2007
## Median :4050 Mode :character Median :2008
## Mean :4202 Mean :2008
## 3rd Qu.:4750 3rd Qu.:2009
## Max. :6300 Max. :2009
## NA's :2
str(dataset)
## 'data.frame': 344 obs. of 7 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ bill_length_mm : num 39.1 39.5 40.3 NA 36.7 39.3 38.9 39.2 34.1 42 ...
## $ bill_depth_mm : num 18.7 17.4 18 NA 19.3 20.6 17.8 19.6 18.1 20.2 ...
## $ flipper_length_mm: int 181 186 195 NA 193 190 181 195 193 190 ...
## $ body_mass_g : int 3750 3800 3250 NA 3450 3650 3625 4675 3475 4250 ...
## $ sex : chr "male" "female" "female" NA ...
## $ year : int 2007 2007 2007 2007 2007 2007 2007 2007 2007 2007 ...
df <- dataset
df <- subset(df, select = -X)
df <- df %>% mutate(bill_length_mm = ifelse(is.na(bill_length_mm), mean(bill_length_mm, na.rm = TRUE), bill_length_mm))
df <- df %>% mutate(bill_depth_mm = ifelse(is.na(bill_depth_mm), mean(bill_depth_mm, na.rm = TRUE), bill_depth_mm))
df <- df %>% mutate(flipper_length_mm = ifelse(is.na(flipper_length_mm), mean(flipper_length_mm, na.rm = TRUE), flipper_length_mm))
df <- df %>% mutate(body_mass_g = ifelse(is.na(body_mass_g), mean(body_mass_g, na.rm = TRUE), body_mass_g))
df <- df %>% mutate(sex = ifelse(is.na(sex), median(sex, na.rm = TRUE), sex))
df <- df %>% mutate(year = ifelse(is.na(year), median(year, na.rm = TRUE), year))
summary(df)
## bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
## Min. :32.10 Min. :13.10 Min. :172.0 Min. :2700
## 1st Qu.:39.27 1st Qu.:15.60 1st Qu.:190.0 1st Qu.:3550
## Median :44.25 Median :17.30 Median :197.0 Median :4050
## Mean :43.92 Mean :17.15 Mean :200.9 Mean :4202
## 3rd Qu.:48.50 3rd Qu.:18.70 3rd Qu.:213.0 3rd Qu.:4750
## Max. :59.60 Max. :21.50 Max. :231.0 Max. :6300
## sex year
## Length:344 Min. :2007
## Class :character 1st Qu.:2007
## Mode :character Median :2008
## Mean :2008
## 3rd Qu.:2009
## Max. :2009
# sex 1,2
# year 1,2,3
df <- df %>% mutate(sex = ifelse(sex == "male", 1, 2))
# [0,1]
df$year <- (df$year - min(df$year)) / (max(df$year) - min(df$year))
summary(df)
## bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
## Min. :32.10 Min. :13.10 Min. :172.0 Min. :2700
## 1st Qu.:39.27 1st Qu.:15.60 1st Qu.:190.0 1st Qu.:3550
## Median :44.25 Median :17.30 Median :197.0 Median :4050
## Mean :43.92 Mean :17.15 Mean :200.9 Mean :4202
## 3rd Qu.:48.50 3rd Qu.:18.70 3rd Qu.:213.0 3rd Qu.:4750
## Max. :59.60 Max. :21.50 Max. :231.0 Max. :6300
## sex year
## Min. :1.00 Min. :0.0000
## 1st Qu.:1.00 1st Qu.:0.0000
## Median :1.00 Median :0.5000
## Mean :1.48 Mean :0.5145
## 3rd Qu.:2.00 3rd Qu.:1.0000
## Max. :2.00 Max. :1.0000
str(df)
## 'data.frame': 344 obs. of 6 variables:
## $ bill_length_mm : num 39.1 39.5 40.3 43.9 36.7 ...
## $ bill_depth_mm : num 18.7 17.4 18 17.2 19.3 ...
## $ flipper_length_mm: num 181 186 195 201 193 ...
## $ body_mass_g : num 3750 3800 3250 4202 3450 ...
## $ sex : num 1 2 2 1 2 1 2 1 1 1 ...
## $ year : num 0 0 0 0 0 0 0 0 0 0 ...
Once the data is prepared, it’s ready to be used in the models. In this section, each model is given three random subsets of the cleaned dataset to classify as one of the class labels. The models are as follows:
Each run of the model will output the confusion matrix and the evaluation of each run. Later on, it will be studied and commented on in the Results section. Additionally, the results of each run will be displayed immediately after every model, as shown in the following sections for each model.
set.seed(2828)
penguins <- df
penguins$bill_length_mm <- as.numeric(penguins$bill_length_mm)
penguins$bill_depth_mm <- as.numeric(penguins$bill_depth_mm)
penguins$flipper_length_mm <- as.numeric(penguins$flipper_length_mm)
penguins$body_mass_g <- as.numeric(penguins$body_mass_g)
summary(penguins)
## bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
## Min. :32.10 Min. :13.10 Min. :172.0 Min. :2700
## 1st Qu.:39.27 1st Qu.:15.60 1st Qu.:190.0 1st Qu.:3550
## Median :44.25 Median :17.30 Median :197.0 Median :4050
## Mean :43.92 Mean :17.15 Mean :200.9 Mean :4202
## 3rd Qu.:48.50 3rd Qu.:18.70 3rd Qu.:213.0 3rd Qu.:4750
## Max. :59.60 Max. :21.50 Max. :231.0 Max. :6300
## sex year
## Min. :1.00 Min. :0.0000
## 1st Qu.:1.00 1st Qu.:0.0000
## Median :1.00 Median :0.5000
## Mean :1.48 Mean :0.5145
## 3rd Qu.:2.00 3rd Qu.:1.0000
## Max. :2.00 Max. :1.0000
str(penguins)
## 'data.frame': 344 obs. of 6 variables:
## $ bill_length_mm : num 39.1 39.5 40.3 43.9 36.7 ...
## $ bill_depth_mm : num 18.7 17.4 18 17.2 19.3 ...
## $ flipper_length_mm: num 181 186 195 201 193 ...
## $ body_mass_g : num 3750 3800 3250 4202 3450 ...
## $ sex : num 1 2 2 1 2 1 2 1 1 1 ...
## $ year : num 0 0 0 0 0 0 0 0 0 0 ...
fviz_nbclust(
x = penguins, FUNcluster = pam, method = "wss", k.max = 15,
diss = dist(penguins, method = "manhattan")
)
matriz_distancias <- dist(x = penguins, method = "euclidean")
hc_euclidea_completo <- hclust(d = matriz_distancias, method = "complete")
hc_euclidea_single <- hclust(d = matriz_distancias, method = "single")
hc_euclidea_average <- hclust(d = matriz_distancias, method = "average")
grid.arrange(
plot(
x = hc_euclidea_completo, cex = 0.6, xlab = "", ylab = "", sub = "",
main = "Linkage complete"
),
plot(
x = hc_euclidea_single, cex = 0.6, xlab = "", ylab = "", sub = "",
main = "Linkage single"
),
plot(
x = hc_euclidea_average, cex = 0.6, xlab = "", ylab = "", sub = "",
main = "Linkage average"
),
nrow = 2
)
features <- penguins[, c("sex", "year", "bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g")]
scaled_features <- scale(features)
k <- 3
kmeans_result <- kmeans(scaled_features, centers = k)
penguins$cluster <- kmeans_result$cluster
plot(penguins[, c(1:6)], col = kmeans_result$cluster)
title("Pair Plot: Scatter Plots of Numeric Variables by Cluster")
fviz_cluster(
object = kmeans_result, data = penguins, show.clust.cent = TRUE,
ellipse.type = "euclid", star.plot = TRUE, repel = TRUE
) +
labs(title = "Resultados clustering K-means") +
theme_bw() +
theme(legend.position = "none")
hclust_result <- hclust(dist(scaled_features))
dend_colored <- color_branches(hclust_result, k = k)
labels(dend_colored) <- NULL
plot(dend_colored, cex = 0.6, main = "Hierarchical Clustering Dendrogram")
rect.hclust(hclust_result, k, border = "black")
grid.arrange(
fviz_dend(
x = hclust_result,
k = k,
color_labels_by_k = TRUE,
cex = 0.8,
type = "phylogenic",
repel = TRUE,
xlab = "",
ylab = ""
),
fviz_dend(
x = hclust_result,
k = k,
color_labels_by_k = TRUE,
cex = 0.5,
type = "circular",
),
nrow = 1
)
k <- 4
kmeans_result <- kmeans(scaled_features, centers = k)
penguins$cluster <- kmeans_result$cluster
plot(penguins[, c(1:6)], col = kmeans_result$cluster)
title("Pair Plot: Scatter Plots of Numeric Variables by Cluster")
fviz_cluster(
object = kmeans_result, data = penguins, show.clust.cent = TRUE,
ellipse.type = "euclid", star.plot = TRUE, repel = TRUE
) +
labs(title = "Resultados clustering K-means") +
theme_bw() +
theme(legend.position = "none")
plot_fviz_cluster <- function(features, dt, cents) {
plot <- fviz_cluster(
object = kmeans(features, centers = cents),
data = dt,
show.clust.cent = FALSE,
ellipse.type = "euclid", star.plot = TRUE, repel = TRUE,
show.dist = FALSE,
xlab = "",
ylab = ""
) +
labs(title = paste("K-means ", cents)) +
theme_bw() +
theme(legend.position = "none")
return(plot)
}
grid.arrange(
plot_fviz_cluster(scaled_features, penguins, 5),
plot_fviz_cluster(scaled_features, penguins, 6),
ncol = 2
)
grid.arrange(
plot_fviz_cluster(scaled_features, penguins, 7),
plot_fviz_cluster(scaled_features, penguins, 8),
ncol = 2
)
hclust_result <- hclust(dist(scaled_features))
dend_colored <- color_branches(hclust_result, k = k)
labels(dend_colored) <- NULL
plot(dend_colored, cex = 0.6, main = "Hierarchical Clustering Dendrogram")
rect.hclust(hclust_result, k, border = "black")
grid.arrange(
fviz_dend(
x = hclust_result,
k = k,
color_labels_by_k = TRUE,
cex = 0.8,
type = "phylogenic",
repel = TRUE,
xlab = "",
ylab = ""
),
fviz_dend(
x = hclust_result,
k = k,
color_labels_by_k = TRUE,
cex = 0.5,
type = "circular",
show_labels = FALSE
),
nrow = 1
)
dbscan_result <- dbscan(scaled_features, eps = 0.5, MinPts = 5)
fviz_cluster(
object = dbscan_result, data = scaled_features, stand = FALSE,
geom = "point", ellipse = TRUE, show.clust.cent = TRUE,
pallete = "jco",
main = "DBSCAN Clustering"
) +
theme_bw() +
theme(legend.position = "bottom")
gmm_result <- Mclust(scaled_features)
plot(gmm_result, what = "classification", main = "GMM Clustering")
agnes_result <- agnes(scaled_features)
fviz_dend(agnes_result,
main = "AGNES Clustering Dendrogram",
show_labels = FALSE,
palette = "jama",
k = 5,
color_labels_by_k = TRUE,
ggtheme = theme_classic()
)
optics_result <- optics(scaled_features, minPts = 5)
plot(optics_result, main = "OPTICS Clustering")
pam_result <- pam(dist(scaled_features, method = "euclidean"), 3)
clusplot(pam_result)
pam_result <- pam(dist(scaled_features, method = "euclidean"), 4)
clusplot(pam_result)
comparacion <- clValid(
obj = scaled_features,
nClust = 2:6,
clMethods = c("hierarchical", "kmeans", "pam", "agnes"),
validation = c("stability", "internal")
)
summary(comparacion)
##
## Clustering Methods:
## hierarchical kmeans pam agnes
##
## Cluster sizes:
## 2 3 4 5 6
##
## Validation Measures:
## 2 3 4 5 6
##
## hierarchical APN 0.0029 0.0571 0.0549 0.0674 0.1238
## AD 2.8952 2.5229 2.2262 2.0173 2.0090
## ADM 0.7357 0.1446 0.2327 0.2605 0.3709
## FOM 0.9791 0.7540 0.7212 0.6927 0.6929
## Connectivity 0.1000 3.0290 3.0290 3.0290 5.9579
## Dunn 0.2793 0.2793 0.3074 0.3276 0.3276
## Silhouette 0.3684 0.3177 0.3285 0.3522 0.3370
## kmeans APN 0.0346 0.0357 0.1608 0.1227 0.1397
## AD 2.5685 2.2502 2.1904 1.9511 1.8755
## ADM 0.1379 0.2187 0.5525 0.4489 0.4458
## FOM 0.8010 0.7467 0.7309 0.7017 0.7037
## Connectivity 0.1000 0.1000 22.5944 22.5944 25.3778
## Dunn 0.2793 0.2890 0.0751 0.0901 0.0901
## Silhouette 0.3684 0.3455 0.3296 0.3654 0.3500
## pam APN 0.3197 0.1342 0.0960 0.1521 0.2153
## AD 2.9847 2.3744 2.0505 1.9692 1.8878
## ADM 1.1915 0.5252 0.3418 0.4846 0.5989
## FOM 0.9691 0.8096 0.7188 0.7156 0.7008
## Connectivity 57.5786 8.1048 2.9290 31.1913 48.7794
## Dunn 0.0492 0.0725 0.2428 0.0858 0.1039
## Silhouette 0.3263 0.3424 0.3668 0.3629 0.3433
## agnes APN 0.0029 0.0571 0.0549 0.0674 0.1238
## AD 2.8952 2.5229 2.2262 2.0173 2.0090
## ADM 0.7357 0.1446 0.2327 0.2605 0.3709
## FOM 0.9791 0.7540 0.7212 0.6927 0.6929
## Connectivity 0.1000 3.0290 3.0290 3.0290 5.9579
## Dunn 0.2793 0.2793 0.3074 0.3276 0.3276
## Silhouette 0.3684 0.3177 0.3285 0.3522 0.3370
##
## Optimal Scores:
##
## Score Method Clusters
## APN 0.0029 hierarchical 2
## AD 1.8755 kmeans 6
## ADM 0.1379 kmeans 2
## FOM 0.6927 hierarchical 5
## Connectivity 0.1000 hierarchical 2
## Dunn 0.3276 hierarchical 5
## Silhouette 0.3684 hierarchical 2